This R Markdown document presents a comprehensive analytical pipeline for single-cell RNA sequencing (scRNA-seq) data analysis utilizing the Seurat v5 framework(Hao et al. 2021). The workflow encompasses essential steps in single-cell transcriptomics analysis, including:
The pipeline is optimized for multi-sample analysis and implements best practices in single-cell data processing to ensure robust and reproducible results.
The following section initializes the computational environment by loading essential R packages for scRNA-seq analysis. Each package serves specific analytical functions:
This section establishes the project infrastructure and imports the raw single-cell data:
project_dir = "D:/Data_Analysis_Practice_2025/scRNAseq/scRNAseq_Analysis_templates"
result_dir = file.path(project_dir, 'results')
dir.create(result_dir, showWarnings = FALSE)
final_fig_dir = file.path(result_dir, 'figures')
dir.create(final_fig_dir, showWarnings = FALSE)
message("Reading data...")
seurat_data <- Read10X_h5("count/cellrange_aggr_filtered_feature_bc_matrix.h5", use.names = TRUE, unique.features = TRUE
)
message("...reading data done!")
seurat_obj <- CreateSeuratObject(counts = seurat_data,
min.features = 100,
min.cells = 1)
# remove seurat_data to save memory
rm(seurat_data)This section implements systematic metadata annotation to facilitate downstream analyses:
The metadata structure is designed to support both sample-specific and condition-wide analyses while maintaining data provenance.
# Add sampleID based on the barcode suffix
seurat_obj@meta.data <- seurat_obj@meta.data %>%
mutate(
sampleID = case_when(
grepl("-1$", rownames(seurat_obj@meta.data)) ~ "C1_BPN2",
grepl("-2$", rownames(seurat_obj@meta.data)) ~ "FXS_BPN2",
grepl("-3$", rownames(seurat_obj@meta.data)) ~ "C1_BPN1",
grepl("-4$", rownames(seurat_obj@meta.data)) ~ "C1_VEH1",
grepl("-5$", rownames(seurat_obj@meta.data)) ~ "FXS_BPN1",
grepl("-6$", rownames(seurat_obj@meta.data)) ~ "FXS_VEH2",
grepl("-7$", rownames(seurat_obj@meta.data)) ~ "C1_VEH2",
grepl("-8$", rownames(seurat_obj@meta.data)) ~ "FXS_VEH1",
)
)
# Create a general condition identifier
seurat_obj@meta.data$orig.ident <- as.character(seurat_obj@meta.data$orig.ident)
seurat_obj@meta.data[grep("-1$|-3$", rownames(seurat_obj@meta.data)), ]$orig.ident <- "C1_BPN"
seurat_obj@meta.data[grep("-2$|-5$", rownames(seurat_obj@meta.data)), ]$orig.ident <- "FXS_BPN"
seurat_obj@meta.data[grep("-4$|-7$", rownames(seurat_obj@meta.data)), ]$orig.ident <- "C1_VEH"
seurat_obj@meta.data[grep("-6$|-8$", rownames(seurat_obj@meta.data)), ]$orig.ident <- "FXS_VEH"
seurat_obj@meta.data$orig.ident <- as.factor(seurat_obj@meta.data$orig.ident)
# Display metadata head and tail
head(seurat_obj@meta.data)
tail(seurat_obj@meta.data)
# Save initial object
saveRDS(seurat_obj, "results/seurat_obj.rds")This section implements comprehensive quality control measures to ensure data reliability:
The quality control parameters are optimized for neural tissue single-cell analysis while maintaining analytical stringency.
seurat_obj <- readRDS(file.path(result_dir, "seurat_obj.rds"))
seurat_obj <- PercentageFeatureSet(seurat_obj, pattern = "^MT-", col.name = "percent.mt")
# Visualize QC metrics
VlnPlot(seurat_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), split.by = "sampleID")Figure 1: Violinplot
This section implements the Harmony algorithm for batch effect correction, a critical step for multi-sample analysis:
The integration strategy is optimized to maintain biological signals while removing technical artifacts.
NML_HAR <- NormalizeData(seurat_obj)
NML_HAR <- FindVariableFeatures(NML_HAR)
NML_HAR <- ScaleData(NML_HAR)
NML_HAR <- RunPCA(NML_HAR, verbose = TRUE)
NML_HAR <- RunHarmony(NML_HAR, group.by.vars = "orig.ident")
NML_HAR <- RunUMAP(NML_HAR, reduction = "harmony", dims = 1:30)
NML_HAR <- FindNeighbors(NML_HAR, reduction = "harmony", dims = 1:30)
NML_HAR <- FindClusters(NML_HAR)
saveRDS(NML_HAR, file.path(result_dir, "NML_HAR.rds"))
# Visualize Harmony results
DimPlot(NML_HAR, group.by = "orig.ident")
rm(NML_HAR)Figure 2: Harmony integration results
This section implements Seurat’s reciprocal PCA (RPCA) integration method, providing an alternative approach to batch correction:
The RPCA approach offers robust integration while preserving unique biological signals present in individual samples.
# Split object by sample and process
seurat_obj[['RNA']] <- split(seurat_obj[['RNA']], f = seurat_obj$sampleID)
seurat_obj <- NormalizeData(seurat_obj)
seurat_obj <- FindVariableFeatures(seurat_obj)
seurat_obj <- ScaleData(seurat_obj)
seurat_obj <- RunPCA(seurat_obj, verbose = TRUE)
# Visualize Elbow Plot
ElbowPlot(seurat_obj, ndims = 50)Figure 3: Elbowplot
# Integrate layers with RPCA
options(future.globals.maxSize = +Inf)
rpca_seurat_obj <- IntegrateLayers(
object = seurat_obj, method = RPCAIntegration,
orig.reduction = "pca", new.reduction = "integrated.rpca",
verbose = FALSE
)
rpca_seurat_obj <- FindNeighbors(rpca_seurat_obj, reduction = "integrated.rpca", dims = 1:30)
rpca_seurat_obj <- FindClusters(rpca_seurat_obj, resolution = 0.5, cluster.name = "rpca_clusters")
rpca_seurat_obj <- RunUMAP(
rpca_seurat_obj,
reduction = "integrated.rpca",
dims = 1:30,
reduction.name = "umap_rpca"
)
# Visualize RPCA results
DimPlot(rpca_seurat_obj, group.by = c("orig.ident", "rpca_clusters"), ncol = 2)
saveRDS(rpca_seurat_obj, file.path(result_dir, "rpca_seurat_obj.rds"))
rm(rpca_seurat_obj)Figure 4: RPCA integration results
This section implements systematic cell population identification and biological annotation:
The clustering approach is designed to capture both major cell types and subtle subpopulations while maintaining biological relevance.
# This is a placeholder for the long loop.
# The original script runs this loop from res 0.1 to 4.0.
# For this Rmd, we will just run for one resolution as an example.
i = 0.7
seurat_obj <- readRDS(file.path(result_dir, "rpca_seurat_obj.rds")) # Using RPCA integrated object
seurat_obj <- FindClusters(seurat_obj, resolution = i, cluster.name = "rpca_clusters")
seurat_obj <- RunUMAP(seurat_obj, reduction = "integrated.rpca", dims = 1:30, reduction.name = "umap_rpca")
# View UMAP
DimPlot(seurat_obj, reduction = "umap_rpca", group.by = "rpca_clusters", label = TRUE)Figure 5: Clustering results at resolution 0.7
This section implements manual cell type annotation based on established molecular signatures and biological knowledge:
The manual annotation process integrates both computational results and domain expertise to ensure biological accuracy.
seurat_obj@meta.data$annotated_celltype <- NA
seurat_obj@meta.data[seurat_obj@meta.data$rpca_clusters %in% c(4), 'annotated_celltype'] = 'radial glia'
seurat_obj@meta.data[seurat_obj@meta.data$rpca_clusters %in% c(7,14,18,15), 'annotated_celltype'] = 'proliferating radial glia'
# View annotated UMAP
DimPlot(seurat_obj, reduction = "umap_rpca", group.by = "annotated_celltype", label = TRUE)
saveRDS(seurat_obj, file.path(result_dir, "annotated_seurat_obj.rds"))Figure 6: Annotated cell types on UMAP
Perform DEG analysis between conditions for specific cell types.
annotated_combined_obj <- readRDS(file.path(result_dir, "annotated_res3.5_combined_res3.0_obj.rds"))
## Example: Find DEGs between FXS_VEH and C1_VEH in radial glia
Idents(annotated_combined_obj) <- "orig.ident"
radial_glia_obj <- subset(annotated_combined_obj, subset = annotated_celltype == "radial glia")
radial_glia_obj <- JoinLayers(radial_glia_obj)
deg_markers <- FindMarkers(
radial_glia_obj,
logfc.threshold = 0.25,
ident.1 = "FXS_VEH",
ident.2 = "C1_VEH"
)
head(deg_markers)
write.csv(deg_markers, file.path(result_dir, "radial_glia_FXS_vs_C1_DEGs.csv"))Perform GO enrichment analysis on the identified differentially expressed genes.
library(clusterProfiler)
library(org.Hs.eg.db)
# Use genes from the previous DEG analysis
genes <- rownames(deg_markers[deg_markers$p_val_adj < 0.05, ])
# Map gene symbols to Entrez IDs
entrez_ids <- mapIds(org.Hs.eg.db, keys = genes, column = "ENTREZID", keytype = "SYMBOL", multiVals = "first")
entrez_ids <- entrez_ids[!is.na(entrez_ids)]
# Run GO enrichment
go_results <- enrichGO(
gene = entrez_ids,
OrgDb = org.Hs.eg.db,
ont = "BP", # Biological Process
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE
)
# Plot results
dotplot(go_results, showCategory=20)Figure 7: GO enrichment analysis results
This chunk provides information about the R session, including package versions.
#> R version 4.5.0 (2025-04-11 ucrt)
#> Platform: x86_64-w64-mingw32/x64
#> Running under: Windows 11 x64 (build 26100)
#>
#> Matrix products: default
#> LAPACK version 3.12.1
#>
#> locale:
#> [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 LC_MONETARY=English_United States.utf8
#> [4] LC_NUMERIC=C LC_TIME=English_United States.utf8
#>
#> time zone: America/New_York
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] htmltools_0.5.8.1 kableExtra_1.4.0 knitr_1.50
#>
#> loaded via a namespace (and not attached):
#> [1] svglite_2.2.1 cli_3.6.5 rlang_1.1.6 xfun_0.53 stringi_1.8.7 showtextdb_3.0
#> [7] sysfonts_0.8.9 assertthat_0.2.1 textshaping_1.0.3 jsonlite_2.0.0 glue_1.8.0 sass_0.4.10
#> [13] scales_1.4.0 rmarkdown_2.30 klippy_0.0.0.9500 evaluate_1.0.5 jquerylib_0.1.4 fastmap_1.2.0
#> [19] yaml_2.3.10 lifecycle_1.0.4 stringr_1.5.2 compiler_4.5.0 RColorBrewer_1.1-3 rstudioapi_0.17.1
#> [25] systemfonts_1.3.1 farver_2.1.2 digest_0.6.37 viridisLite_0.4.2 R6_2.6.1 dichromat_2.0-0.1
#> [31] showtext_0.9-7 magrittr_2.0.4 bslib_0.9.0 tools_4.5.0 xml2_1.4.0 cachem_1.1.0